1 Aggregated and atomic scores per method



# datasets = read_yaml("datasets.yml") 
# print(score_file)

# datasets = read_yaml("datasets.yml") 
# datasets = read_yaml(file_dataset) 


# Locate the late-integration score files. Where they live depends on which
# workflow manager launched this script.
#
# `list.files(pattern = )` expects a regular expression, not a shell glob, so
# the file names are matched with an anchored regex: they must start with
# "score-li-" and end with ".h5".
score_file_pattern <- "^score-li-.*\\.h5$"

list_wd <- strsplit(getwd(), "/")[[1]]
if (list_wd[length(list_wd)] == "hadaca3_framework") {
  # Snakemake script: the current working dir is hadaca3_framework.
  score_files <- list(
    list.files(
      path = "./output/scores/",
      pattern = score_file_pattern,
      full.names = TRUE
    )
  )
} else {
  # Nextflow script: the score files are looked up in the working directory.
  score_files <- list(list.files(pattern = score_file_pattern))
}


# Empty template for the late-integration results: one character column per
# pipeline descriptor, followed by one numeric column per score.
results_li <- local({
  descriptor_cols <- c(
    "dataset", "ref",
    "preprocessing_mixRNA", "feature_selection_mixRNA",
    "preprocessing_RNA", "feature_selection_RNA",
    "preprocessing_scRNA", "feature_selection_scRNA", "deconvolution_rna",
    "preprocessing_mixMET", "feature_selection_mixMET",
    "preprocessing_MET", "feature_selection_MET", "deconvolution_met",
    "late_integration"
  )
  score_cols <- c(
    "aid", "aid_norm",
    "aitchison", "aitchison_norm",
    "jsd", "jsd_norm",
    "mae", "mae_norm",
    "pearson_col", "pearson_col_norm",
    "pearson_row", "pearson_row_norm",
    "pearson_tot", "pearson_tot_norm",
    "rmse", "rmse_norm",
    "score_aggreg",
    "sdid", "sdid_norm",
    "spearman_col", "spearman_col_norm",
    "spearman_row", "spearman_row_norm",
    "spearman_tot", "spearman_tot_norm"
  )
  # Named lists of zero-length vectors become zero-row columns.
  data.frame(
    c(
      lapply(setNames(nm = descriptor_cols), function(col) character()),
      lapply(setNames(nm = score_cols), function(col) numeric())
    )
  )
})


# Regular expression for a late-integration score file name. The 15 capture
# groups are, in order:
#   dataset, ref,
#   mixRNA preprocessing / feature selection,
#   RNA preprocessing / feature selection,
#   scRNA preprocessing / feature selection, RNA deconvolution,
#   mixMET preprocessing / feature selection,
#   MET preprocessing / feature selection, MET deconvolution,
#   late integration.
# The pattern is anchored and the extension dot is escaped so that only
# complete "score-li-... .h5" names are accepted.
score_name_pattern <- paste0(
  "^score-li-(.+)_(.+)",
  "_mixRNA_(.+)_(.+)",
  "_RNA_(.+)_(.+)",
  "_scRNA_(.+)_(.+)_(.+)",
  "_mixMET_(.+)_(.+)",
  "_MET_(.+)_(.+)_(.+)",
  "_(.+)\\.h5$"
)

# Column names given to the 15 capture groups, in capture order.
score_metadata_cols <- c(
  "dataset", "ref",
  "preprocessing_mixRNA", "feature_selection_mixRNA",
  "preprocessing_RNA", "feature_selection_RNA",
  "preprocessing_scRNA", "feature_selection_scRNA", "deconvolution_rna",
  "preprocessing_mixMET", "feature_selection_mixMET",
  "preprocessing_MET", "feature_selection_MET", "deconvolution_met",
  "late_integration"
)

# Build the result row(s) for one score file: the descriptors parsed from the
# file name, column-bound to the scores returned by read_hdf5().
# Returns NULL (with a warning) when the file name does not match the expected
# pattern, instead of producing a row whose descriptors are all NA.
read_score_row <- function(score_file) {
  components <- str_match(basename(score_file), score_name_pattern)[1, -1]
  if (anyNA(components)) {
    warning(
      "Skipping score file with an unexpected name: ", score_file,
      call. = FALSE
    )
    return(NULL)
  }
  metadata <- as.data.frame(
    as.list(setNames(components, score_metadata_cols)),
    stringsAsFactors = FALSE
  )
  cbind(metadata, read_hdf5(score_file))
}

# Read every file first, then bind once: calling rbind() inside the loop
# copies the accumulated data frame at every iteration.
score_rows <- Filter(
  Negate(is.null),
  lapply(score_files[[1]], read_score_row)
)
results_li <- do.call(rbind, c(list(results_li), score_rows))
rownames(results_li) <- NULL

# Median aggregated score per late-integration method, best first.
# (An optional `filter(dc==2)` step is currently disabled.)
arrange(
  summarise(
    group_by(results_li, late_integration),
    GlobalScore = median(score_aggreg)
  ),
  desc(GlobalScore)
)
#> # A tibble: 3 × 2
#>   late_integration GlobalScore
#>   <chr>                  <dbl>
#> 1 OnlyMet                0.663
#> 2 limeanRMSE             0.660
#> 3 OnlyRna                0.646


# Median aggregated score for every complete pipeline (one row per combination
# of the 13 method choices), sorted from best to worst.
#
# `.groups` is an argument of summarise(), not of group_by(): passed to
# group_by() it creates an extra grouping column literally named `.groups`.
# With "keep", the result stays grouped by the 13 method columns.
results_li_arrange <- results_li %>%
  group_by(
    preprocessing_mixRNA, feature_selection_mixRNA,
    preprocessing_RNA, feature_selection_RNA,
    preprocessing_scRNA, feature_selection_scRNA, deconvolution_rna,
    preprocessing_mixMET, feature_selection_mixMET,
    preprocessing_MET, feature_selection_MET, deconvolution_met,
    late_integration
  ) %>%
  summarise(GlobalScore = median(score_aggreg), .groups = "keep") %>%
  arrange(desc(GlobalScore))
#> `summarise()` has grouped output by 'preprocessing_mixRNA',
#> 'feature_selection_mixRNA', 'preprocessing_RNA', 'feature_selection_RNA',
#> 'preprocessing_scRNA', 'feature_selection_scRNA', 'deconvolution_rna',
#> 'preprocessing_mixMET', 'feature_selection_mixMET', 'preprocessing_MET',
#> 'feature_selection_MET', 'deconvolution_met', 'late_integration'. You can
#> override using the `.groups` argument.




# Turn the data descriptors into factors. Levels are taken from unique(), so
# they follow the order in which values first appear in results_li.
all_data_used <- c("dataset", "ref")
results_li[all_data_used] <- lapply(
  results_li[all_data_used],
  function(values) factor(values, levels = unique(values))
)



# Turn every method column into a factor whose levels are ranked by the
# aggregated score obtained on the in vitro dataset (best first).
all_functions_li <- c(
  "preprocessing_mixRNA", "feature_selection_mixRNA",
  "preprocessing_RNA", "feature_selection_RNA",
  "preprocessing_scRNA", "feature_selection_scRNA", "deconvolution_rna",
  "preprocessing_mixMET", "feature_selection_mixMET",
  "preprocessing_MET", "feature_selection_MET", "deconvolution_met",
  "late_integration"
)

# The ordering is computed on the 'invitro1' rows only, so it must be applied
# to the same subset of each column. Applying it to the full column picks
# unrelated rows and can leave values out of the levels (they then become NA).
invitro_rows <- which(results_li$dataset == "invitro1")
invitro_rank <- order(
  results_li$score_aggreg[invitro_rows],
  decreasing = TRUE
)

for (fun in all_functions_li) {
  values <- as.character(results_li[[fun]])
  ranked_levels <- unique(values[invitro_rows][invitro_rank])
  # Values never seen on the in vitro dataset are appended after the ranked
  # ones so that no observation is turned into NA.
  results_li[[fun]] <- factor(
    values,
    levels = union(ranked_levels, unique(values))
  )
}



# Interactive table showing the method used at each pipeline step together
# with the aggregated score.
index_aggreg <- which(names(results_li) == "score_aggreg")

# Columns are selected by name instead of by position: the former
# `1:length(all_functions_li)+2` relied on `:` binding tighter than `+` and on
# the method columns sitting exactly at positions 3 to 15.
displayed_cols <- c(all_functions_li, names(results_li)[index_aggreg])

datatable(
  results_li[, displayed_cols],
  extensions = "Buttons",
  options = list(
    pageLength = 10,
    dom = "Bfrtip", # includes the Buttons extension in the layout
    buttons = list(
      list(
        extend = "colvis",
        text = "Show/Hide Columns",
        # every column except the first one can be toggled
        columns = ":not(:first-child)"
      )
    )
  )
)

2 Early integration table

#> # A tibble: 0 × 2
#> # ℹ 2 variables: early_integration <chr>, GlobalScore <dbl>

3 Visualisations of the top methods

3.1 Top 5 best methods

3.2 Top 5 worst methods

#> # A tibble: 150 × 3
#>    cell_type pred_prop method     
#>    <chr>         <dbl> <chr>      
#>  1 endo         0.156  method top1
#>  2 fibro        0.0841 method top1
#>  3 immune       0.481  method top1
#>  4 classic      0.0400 method top1
#>  5 basal        0.240  method top1
#>  6 endo         0.160  method top1
#>  7 fibro        0.173  method top1
#>  8 immune       0.412  method top1
#>  9 classic      0      method top1
#> 10 basal        0.256  method top1
#> # ℹ 140 more rows
#> # A tibble: 75 × 3
#>    cell_type pred_prop method     
#>    <chr>         <dbl> <chr>      
#>  1 endo          0.245 method top1
#>  2 fibro         0.214 method top1
#>  3 immune        0.423 method top1
#>  4 classic       0     method top1
#>  5 basal         0.119 method top1
#>  6 endo          0.159 method top1
#>  7 fibro         0.145 method top1
#>  8 immune        0.291 method top1
#>  9 classic       0     method top1
#> 10 basal         0.405 method top1
#> # ℹ 65 more rows
#> # A tibble: 120 × 3
#>    cell_type pred_prop method     
#>    <chr>         <dbl> <chr>      
#>  1 endo        0.0393  method top1
#>  2 fibro       0.0890  method top1
#>  3 immune      0.742   method top1
#>  4 classic     0.123   method top1
#>  5 basal       0.00678 method top1
#>  6 endo        0.0309  method top1
#>  7 fibro       0.0673  method top1
#>  8 immune      0.587   method top1
#>  9 classic     0.315   method top1
#> 10 basal       0       method top1
#> # ℹ 110 more rows

4 Visualisations of the different metrics

4.1 Paper figures

4.1.1 PP

4.1.2 FS

4.1.3 DE

4.1.4 LI

4.2 Aggregated scores

4.2.1 PP

4.2.2 FS

4.2.3 DE

4.2.4 LI

4.3 MAE

4.3.1 PP

4.3.2 FS

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.3.3 DE

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.3.4 LI

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.4 RMSE

4.4.1 PP

4.4.2 FS

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.4.3 DE

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.4.4 LI

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.5 Spearman correlation (row)

4.5.1 PP

4.5.2 FS

4.5.3 DE

4.5.4 LI

4.6 Aitchison distance

4.6.1 PP

4.6.2 FS

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.6.3 DE

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).
#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).

4.6.4 LI

#> Warning: Removed 576 rows containing non-finite outside the scale range
#> (`stat_ydensity()`).